2.1 Data Source
- New York City Open Data
- NYPD Complaint Historical Data 2006 to 2017
- NYC Borough Boundaries (JSON)
New York City Planning
- NYC Borough Population 1900 to 2010
- NYC Total Housing Units 1940 to 2010
Foursquare API
2.2 Data Wrangling and Transformation Approach
New York City Open Data
- NYPD Complaint Historical Data 2006 to 2017
Due to space limitations of the Cognitive Class lab environment, this 1.8GB dataset with 6.04 million rows and 35 columns had to be downloaded directly to a personal computer and loaded into a Microsoft Access database so that the required data could be extracted and transformed into the required format using SQL.
Source: https://data.cityofnewyork.us/Public-Safety/NYPD-Complaint-Data-Historic/qgea-i56i
- NYC Borough Boundaries (JSON)
The encoding format had to be enhanced slightly so that the crime data could be incorporated when generating the Python Folium choropleth map.
Source: https://data.cityofnewyork.us/City-Government/Borough-Boundaries/tqmj-j8zm
New York City Planning
- NYC Borough Total Population 1900 to 2010
Minimal transformation required.
Source: https://www1.nyc.gov/site/planning/data-maps/nyc-population/historical-population.page
- NYC Borough Total Housing Units 1940 to 2010
Minimal transformation required.
Source: https://www1.nyc.gov/site/planning/data-maps/nyc-population/historical-population.page
Foursquare API
A Foursquare developer account was opened in order to take advantage of its API, which provides social-networking location information about venues, users, and check-ins. The API enables the researcher to perform k-means clustering of New York City's underlying neighborhoods.
#!conda install -c conda-forge folium=0.5.0 --yes
import folium
from folium import plugins
import numpy as np
import requests # library to handle requests
import pandas as pd
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe
#!conda install -c conda-forge geopy --yes # uncomment this line if you haven't completed the Foursquare API lab
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values
import geopy.distance
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors
#import seaborn as sns
mpl.style.use('ggplot') # optional: for ggplot-like style
import json # library to handle JSON files
# import k-means from clustering stage
from sklearn.cluster import KMeans
import os
import webbrowser
from functools import reduce
print('Required libraries imported...')
# Load the 2017 NYPD complaint extract and count complaints per borough.
df_complaint_data = pd.read_csv('nypd_complaint_data_2017.csv')
# Work on a copy: the original aliased the raw dataframe, so the in-place
# drop below silently stripped columns from df_complaint_data as well.
complaint_data = df_complaint_data.copy()
complaint_data.drop(['Cmplnt_Num', 'Month', 'Offense', 'Latitude', 'Longitude'], axis=1, inplace=True)
# value_counts over the remaining 'Borough' column yields complaints per borough.
complaint_data = complaint_data.apply(pd.Series.value_counts).reset_index()
# Rename to the column names the choropleth step expects.
complaint_data.rename(columns={'index': 'Neighborhood', 'Borough': 'Crime'}, inplace=True)
# Geocode New York City once; its coordinates center every city-wide map below.
address = 'New York, NY'
geolocator = Nominatim(user_agent="my-application")
location = geolocator.geocode(address)
nyc_lat, nyc_lon = location.latitude, location.longitude
# GeoJSON file holding the NYC borough boundary polygons.
nyc_zones = r'borough_boundaries.json'
nyc_complaint_map = folium.Map(location=[nyc_lat, nyc_lon],
                               zoom_start=10,
                               tiles='OpenStreetMap')
# folium.Map.choropleth() is deprecated (removed in folium >= 0.12); the
# folium.Choropleth class is the supported replacement with the same options.
folium.Choropleth(
    geo_data=nyc_zones,
    data=complaint_data,
    columns=['Neighborhood', 'Crime'],
    # Join key: the 'boro_name' property inside each GeoJSON feature.
    key_on='feature.properties.boro_name',
    fill_color='YlOrRd',
    fill_opacity=0.5,
    line_opacity=0.5,
    legend_name='2017 New York Boroughs Complaint Data',
    threshold_scale=[25000, 50000, 75000, 100000, 125000, 150000]
).add_to(nyc_complaint_map)
nyc_complaint_map.save('nyc_complaint_map.html')
nyc_complaint_map
# Unstacked area chart of NYC borough populations, 1900-2010.
df_population_data = pd.read_csv('nyc_total_population_1900-2010.csv')
population_data = df_population_data.set_index('Year')
population_data
ax = population_data.plot(kind='area', alpha=0.3, figsize=(18, 8), stacked=False, linewidth=3)
ax.set_title("New York City Population from 1900 to 2010", loc="left", fontsize=14)
ax.legend(loc="upper left")
ax.set_ylabel("POPULATION", fontsize=12)
ax.set_xlabel("YEAR", fontsize=12)
ax.figure.savefig('nyc_population.png')
plt.show()
# Unstacked area chart of NYC housing units, 1940-2010.
df_housing_data = pd.read_csv('nyc_housing_unit_1940-2010.csv')
housing_data = df_housing_data.set_index('Year')
housing_data
ax = housing_data.plot(kind='area', alpha=0.3, figsize=(18, 8), stacked=False, linewidth=3)
ax.set_title("New York City Housing Unit from 1940 to 2010", loc="left", fontsize=14)
ax.legend(loc="upper left")
ax.set_ylabel("HOUSING UNIT", fontsize=14)
ax.set_xlabel("YEAR", fontsize=14)
ax.figure.savefig('nyc_housing.png')
plt.show()
# Unstacked area chart of NYPD complaints per year, 2006-2017.
df_complaint_hist = pd.read_csv('nypd_complaint_data_2006-2017.csv')
complaint_hist = df_complaint_hist.set_index('Year')
complaint_hist
ax = complaint_hist.plot(kind='area', alpha=0.5, figsize=(18, 8), stacked=False, linewidth=3)
ax.set_title("NYPD Complaint Data from 2006 to 2017", loc="left", fontsize=16)
ax.legend(loc="upper left")
ax.set_ylabel("COMPLAINTS", fontsize=14)
ax.set_xlabel("YEAR", fontsize=14)
ax.figure.savefig('nyc_complaint.png')
plt.show()
# Grouped bar chart comparing complaint-type shares across boroughs (2017).
df_complaint_smry = pd.read_csv('nyc_complaint_data_comparison_2017.csv', index_col=0)
df_complaint_smry
colors_list = ['#f7dc6f', '#d68910', '#a93226']
# Normalize each row to fractions so boroughs of different sizes are comparable.
ax = (df_complaint_smry.div(df_complaint_smry.sum(1), axis=0)).plot(kind='bar', alpha=0.65, figsize=(18, 8), width=0.95, color=colors_list)
# BUG FIX: the original called plt.legend() twice; the second call replaced the
# first and discarded its explicit labels and fontsize.  One call sets everything.
plt.legend(labels=df_complaint_smry.columns, fontsize=14, loc='upper left')
plt.title("New York Boroughs 2017 Complaint Data Comparison", fontsize=14)
plt.xticks(fontsize=12)
# Hide the axes frame and y ticks for a cleaner, annotation-driven chart.
for spine in plt.gca().spines.values():
    spine.set_visible(False)
plt.yticks([])
# Annotate each bar with its percentage value.
for p in ax.patches:
    width, height = p.get_width(), p.get_height()
    x, y = p.get_xy()
    ax.annotate('{:.2%}'.format(height), (p.get_x() + .15 * width, p.get_y() + height + 0.01))
plt.savefig('nyc_complaint_comparison.png')
# Reload the 2017 complaints, keep only Queens, and draw a reproducible
# 0.25% sample (random_state pinned) to keep the maps and API calls tractable.
df_complaint_data = pd.read_csv('nypd_complaint_data_2017.csv')
df_complaint_data = df_complaint_data[df_complaint_data['Borough'] == 'Queens'].reset_index(drop=True)
df_complaint_data = df_complaint_data.sample(frac=.0025, random_state=0)
# Copy before the in-place drop so df_complaint_data keeps all of its columns
# (the original aliased it, so the drop mutated both names).
borough_data = df_complaint_data.copy()
borough_data.drop(['Cmplnt_Num', 'Month'], axis=1, inplace=True)
borough_data.shape
# Geocode Queens and plot each sampled complaint as a clustered map marker.
address = 'Queens, NY'
geolocator = Nominatim(user_agent="my-application")
location = geolocator.geocode(address)
borough_lat = location.latitude
borough_lon = location.longitude
borough_map = folium.Map(location=[borough_lat, borough_lon], zoom_start=11, tiles='OpenStreetMap')
# Marker cluster keeps the map readable when points overlap at low zoom.
incidents = plugins.MarkerCluster().add_to(borough_map)
# One marker per complaint, labelled with the offense description.
for lat, lng, label in zip(borough_data.Latitude, borough_data.Longitude, borough_data.Offense):
    folium.Marker(location=[lat, lng], icon=None, popup=label).add_to(incidents)
borough_map.save('borough_map.html')
# display map
borough_map
# crime_data deliberately ALIASES df_complaint_data (no copy): the new
# 'Neighborhood' column added below is written into the same underlying frame.
crime_data = df_complaint_data
# Placeholder column, filled row by row by the reverse-geocoding loop below.
crime_data['Neighborhood'] = ''
def reverse_geocode(coordinates):
    """Resolve a "lat,lng" string to a place name via the Google Geocoding API.

    Parameters
    ----------
    coordinates : str
        Comma-separated "latitude,longitude" string.

    Returns
    -------
    str
        The long_name of the last address component typed as a
        neighborhood, locality, or postal_town — or '' when the API
        returns no results (later components overwrite earlier ones,
        so the most general match wins, as in the original logic).
    """
    # SECURITY NOTE(review): the API key is hard-coded in source.  It should be
    # read from an environment variable / config, and this exposed key revoked.
    api_key = 'AIzaSyAC17OaVd0IOBC0cXktIKBdUepzI5JpciQ'
    url = 'https://maps.googleapis.com/maps/api/geocode/json?latlng={0}&key={1}'
    r = url.format(coordinates, api_key)
    data = json.loads(requests.get(r).text)
    neighborhood = ''
    if len(data['results']) > 0:
        # Single membership test replaces the original elif chain (the three
        # branches all did the same assignment); the no-op `else: continue`
        # is dropped.
        for component in data['results'][0]['address_components']:
            types = component['types']
            if 'neighborhood' in types or 'locality' in types or 'postal_town' in types:
                neighborhood = component['long_name']
    return neighborhood
# Reverse-geocode every sampled complaint (one Google API call per row).
for i in crime_data.index:
    # .at avoids the original chained assignment (df['col'][i] = ...), which
    # pandas flags with SettingWithCopyWarning and may write to a copy.
    coords = '{},{}'.format(crime_data.at[i, 'Latitude'], crime_data.at[i, 'Longitude'])
    crime_data.at[i, 'Neighborhood'] = reverse_geocode(coords)
crime_data.head()
# Complaint counts per (Borough, Neighborhood) pair.
crime_aggr = crime_data.groupby(['Borough', 'Neighborhood'])['Offense'].count().reset_index()
crime_aggr
#!wget -q -O 'newyork_data.json' https://ibm.box.com/shared/static/fbpwbovar7lf8p5sgddm06cgipa2rxpe.json
#
#with open('newyork_data.json') as json_data:
# newyork_data = json.load(json_data)
#
#neighborhoods_data = newyork_data['features']
#
# define the dataframe columns
#column_names = ['Borough', 'Neighborhood', 'Latitude', 'Longitude']
#
# instantiate the dataframe
#neighborhoods = pd.DataFrame(columns=column_names)
#
#for data in neighborhoods_data:
# borough = neighborhood_name = data['properties']['borough']
# neighborhood_name = data['properties']['name']
#
# neighborhood_latlon = data['geometry']['coordinates']
# neighborhood_lat = neighborhood_latlon[1]
# neighborhood_lon = neighborhood_latlon[0]
#
# neighborhoods = neighborhoods.append({'Borough': borough,
# 'Neighborhood': neighborhood_name,
# 'Latitude': neighborhood_lat,
# 'Longitude': neighborhood_lon}, ignore_index=True)
#
#borough_data = neighborhoods[neighborhoods['Borough'] == 'Queens'].reset_index(drop=True)
#borough_data.to_csv('queens_borough_data.csv')
# Load the pre-built table of Queens neighborhoods with their coordinates
# (produced once by the commented-out extraction code above, then cached as CSV).
borough_data = pd.read_csv('queens_borough_data.csv')
borough_data
# Map each Queens neighborhood centroid as a blue circle marker.
borough_map = folium.Map(location=[borough_lat, borough_lon], zoom_start=10)
# add markers to map
for lat, lng, label in zip(borough_data['Latitude'], borough_data['Longitude'], borough_data['Neighborhood']):
    label = folium.Popup(label, parse_html=True)
    # FIX: removed the stray parse_html=False argument that was passed to
    # CircleMarker -- parse_html belongs to folium.Popup (already set above)
    # and is not a CircleMarker option.
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7).add_to(borough_map)
borough_map.save('borough_cluster_from.html')
borough_map
# @hidden_cell
# SECURITY NOTE(review): Foursquare credentials are hard-coded in source.
# They should be loaded from environment variables or a git-ignored config
# file, and these now-exposed keys rotated.
CLIENT_ID = 'OYCZWB2YCGWZTR5XWBJCOIBZDH4AWI0X5YQ00A3G0OOQITV3' # your Foursquare ID
CLIENT_SECRET = 'B5D2U2XKJPGO5ZA4ESDU5KDHN4NU45GFL4KOI2D5RVRS3GY2' # your Foursquare Secret
VERSION = '20180605' # Foursquare API version
# Trial Foursquare 'explore' query around the first Queens neighborhood.
borough_data.loc[0, 'Neighborhood']
neighborhood_latitude = borough_data.loc[0, 'Latitude']    # neighborhood latitude value
neighborhood_longitude = borough_data.loc[0, 'Longitude']  # neighborhood longitude value
neighborhood_name = borough_data.loc[0, 'Neighborhood']    # neighborhood name
LIMIT = 100    # maximum number of venues returned per request
radius = 2500  # search radius in metres
url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
    CLIENT_ID, CLIENT_SECRET, VERSION,
    neighborhood_latitude, neighborhood_longitude,
    radius, LIMIT)
results = requests.get(url).json()
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    """Fetch nearby venues from Foursquare for each neighborhood.

    Parameters
    ----------
    names, latitudes, longitudes : iterables of equal length
        Neighborhood names and their centroid coordinates.
    radius : int, optional
        Search radius in metres around each centroid (default 500).

    Returns
    -------
    pandas.DataFrame
        One row per venue with neighborhood, venue name, coordinates,
        and primary category.

    Relies on the module-level CLIENT_ID, CLIENT_SECRET, VERSION and LIMIT.
    """
    venues_list = []
    for name, lat, lng in zip(names, latitudes, longitudes):
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID,
            CLIENT_SECRET,
            VERSION,
            lat,
            lng,
            radius,
            LIMIT)
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        # Keep only the fields we need.  Venues with an empty 'categories'
        # list are skipped -- the original indexed categories[0] blindly and
        # crashed with IndexError on such venues.
        venues_list.append([(
            name,
            lat,
            lng,
            v['venue']['name'],
            v['venue']['location']['lat'],
            v['venue']['location']['lng'],
            v['venue']['categories'][0]['name']) for v in results if v['venue']['categories']])
    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood',
                             'Neighborhood Latitude',
                             'Neighborhood Longitude',
                             'Venue',
                             'Venue Latitude',
                             'Venue Longitude',
                             'Venue Category']
    return nearby_venues
# Collect venues for every Queens neighborhood and persist them to CSV.
borough_venues = getNearbyVenues(names=borough_data['Neighborhood'],
                                 latitudes=borough_data['Latitude'],
                                 longitudes=borough_data['Longitude'])
print(borough_venues.shape)
borough_venues.head()
# Re-wrap with an explicit column order before writing to disk.
df_borough_venues = pd.DataFrame(borough_venues,
                                 columns=['Neighborhood', 'Neighborhood Latitude',
                                          'Neighborhood Longitude', 'Venue',
                                          'Venue Latitude', 'Venue Longitude',
                                          'Venue Category'])
df_borough_venues.to_csv('borough_venues.csv')
borough_venues
#borough_venues.groupby(['Neighborhood',
# 'Neighborhood Latitude',
# 'Neighborhood Longitude',
# 'Venue Category']
# )['Venue'].count()
# Venue count per neighborhood (merged into the final table later).
borough_venues_aggr = borough_venues.groupby(['Neighborhood'])['Venue'].count().reset_index()
# FIX: corrected the user-facing message typo "uniques categories".
print('There are {} unique categories.'.format(len(borough_venues['Venue Category'].unique())))
# One-hot encode venue categories, then average per neighborhood: each row of
# borough_grouped becomes a category-frequency vector for one neighborhood.
borough_onehot = pd.get_dummies(borough_venues[['Venue Category']], prefix="", prefix_sep="")
borough_onehot['Neighborhood'] = borough_venues['Neighborhood']
# Put the Neighborhood column first, categories after.
reordered = [borough_onehot.columns[-1]] + list(borough_onehot.columns[:-1])
borough_onehot = borough_onehot[reordered]
borough_grouped = borough_onehot.groupby('Neighborhood').mean().reset_index()
def return_most_common_venues(row, num_top_venues):
    """Return the labels of the num_top_venues largest values in row.

    The first element (the neighborhood name) is skipped; the remaining
    entries are ranked descending and their index labels returned.
    """
    frequencies = row.iloc[1:].sort_values(ascending=False)
    return frequencies.index.values[0:num_top_venues]
# Build a table listing each neighborhood's top-10 venue categories,
# with ordinal column headers: '1st Most Common Venue', ..., '10th ...'.
num_top_venues = 10
indicators = ['st', 'nd', 'rd']
columns = ['Neighborhood']
for rank in np.arange(num_top_venues):
    # 1st/2nd/3rd from the lookup table, everything else gets 'th'.
    suffix = indicators[rank] if rank < len(indicators) else 'th'
    columns.append('{}{} Most Common Venue'.format(rank + 1, suffix))
# One row per neighborhood, filled from its category-frequency vector.
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = borough_grouped['Neighborhood']
for rank in np.arange(borough_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[rank, 1:] = return_most_common_venues(borough_grouped.iloc[rank, :], num_top_venues)
neighborhoods_venues_sorted
# set number of clusters
kclusters = 5
# drop(columns=...) replaces the positional-axis form drop('Neighborhood', 1),
# which is deprecated (and removed in pandas 2.x).
borough_grouped_clustering = borough_grouped.drop(columns='Neighborhood')
# run k-means clustering on the category-frequency vectors
# (random_state pinned so cluster assignments are reproducible)
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(borough_grouped_clustering)
# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10]
# NOTE: borough_merged aliases borough_data here (no copy), so adding the
# 'Cluster Labels' column also mutates borough_data; borough_data is not
# used again afterwards, so this is harmless but order-sensitive.
borough_merged = borough_data
# add clustering labels (row order matches borough_grouped row order)
borough_merged['Cluster Labels'] = kmeans.labels_
# merge borough_grouped with borough_data to add latitude/longitude for each neighborhood
borough_merged = borough_merged.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighborhood')
borough_merged.head() # check the last columns!
# Visualize the clusters: one colored circle per neighborhood.
map_clusters = folium.Map(location=[borough_lat, borough_lon], zoom_start=11)
# Derive one distinct hex color per cluster from the rainbow colormap.
x = np.arange(kclusters)
ys = [i + x + (i * x) ** 2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]
markers_colors = []
# Popup shows the neighborhood name together with its cluster number.
rows = zip(borough_merged['Latitude'], borough_merged['Longitude'],
           borough_merged['Neighborhood'], borough_merged['Cluster Labels'])
for lat, lon, poi, cluster in rows:
    popup = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=popup,
        color=rainbow[cluster - 1],
        fill=True,
        fill_color=rainbow[cluster - 1],
        fill_opacity=0.7).add_to(map_clusters)
map_clusters.save('borough_cluster_to.html')
map_clusters
# Inspect each cluster: neighborhood name (column 1) plus the top-venue
# ranking columns (from column 5 onward).  These are bare display expressions
# from the notebook -- one per cluster label, each intended to be its own cell.
borough_merged.loc[borough_merged['Cluster Labels'] == 0, borough_merged.columns[[1] + list(range(5, borough_merged.shape[1]))]]
borough_merged.loc[borough_merged['Cluster Labels'] == 1, borough_merged.columns[[1] + list(range(5, borough_merged.shape[1]))]]
borough_merged.loc[borough_merged['Cluster Labels'] == 2, borough_merged.columns[[1] + list(range(5, borough_merged.shape[1]))]]
borough_merged.loc[borough_merged['Cluster Labels'] == 3, borough_merged.columns[[1] + list(range(5, borough_merged.shape[1]))]]
borough_merged.loc[borough_merged['Cluster Labels'] == 4, borough_merged.columns[[1] + list(range(5, borough_merged.shape[1]))]]
# Geodesic distance (miles) from each neighborhood to the NYC center point.
nyc_latlon = str(nyc_lat) + ',' + str(nyc_lon)
# One list build replaces the original row-by-row loop, which wrote through
# chained indexing (df['col'][i] = ...) and risked SettingWithCopy issues.
# geodesic() parses "lat,lon" strings, exactly as the original passed them.
borough_merged['Distance'] = [
    geopy.distance.geodesic('{},{}'.format(lat, lon), nyc_latlon).miles
    for lat, lon in zip(borough_merged['Latitude'], borough_merged['Longitude'])
]
borough_merged
# Drop columns not needed in the final merged table.
borough_merged.drop(['Borough', 'Latitude', 'Longitude'], axis=1, inplace=True)
crime_aggr.drop(['Borough'], axis=1, inplace=True)
# NOTE: the original called set_index('Neighborhood') on three frames here
# without assigning the result -- set_index is NOT in-place, so those calls
# were no-ops.  They are removed rather than made in-place, because the merge
# below joins on the 'Neighborhood' COLUMN, which must remain a column.
# Build the final Queens table: cluster/top-venue data left-merged with
# complaint counts and venue counts per neighborhood; missing values -> 0.
# FIX: the original repeated this exact merge twice and re-imported
# functools.reduce, which is already imported at the top of the file.
data_frames = [borough_merged, crime_aggr, borough_venues_aggr]
queens_data = reduce(lambda left, right: pd.merge(left, right, on=['Neighborhood'], how='left'),
                     data_frames).fillna(0)
queens_data.head(10)
# Neighborhoods assigned to cluster 0.
queens_data[queens_data['Cluster Labels'] == 0]